This script reads in both PBMC and Liver Seurat objects, and generates select figures used in the manuscript.
Load libraries
library(Seurat)
library(scClustViz)
library(ggplot2)
library(dplyr)
library(rcartocolor)
library(SeuratWrappers)
library(scCustomize)
Read in liver map
# Load the integrated liver Seurat object and select its clustering resolution
liver_rds <- "~/Dropbox/Zoe/scf_version/analysis/healthy_sc/seurat_objects/dropletQC_filtered/allIntegrated_cca_kanchor5_noBiopsyHeps_dropletQCFiltered.RDS"
sobj <- readRDS(liver_rds)
tissue <- "liver"
# Metadata column holding the cluster assignments used as identities
res <- "integrated_snn_res.1.4"
Idents(sobj) <- res
Read in PBMC map
# Load the PBMC workspace image; it is expected to provide `scSeurat`
pbmc_rdata <- "~/Dropbox/Zoe/scf_version/analysis/healthy_sc/seurat_objects/no_dropletQC/integrated_PBMC_cca_kanchor5_scClustViz.RData"
load(pbmc_rdata)
# NOTE: this replaces any `sobj` loaded earlier (e.g. the liver map)
sobj <- scSeurat
tissue <- "PBMC"
# Metadata column holding the cluster assignments used as identities
res <- "integrated_snn_res.0.6"
Idents(sobj) <- res
UMAP with no cluster numbers
# UMAP of the active identities with neither cluster labels nor a legend
plot <- DimPlot(sobj, label = FALSE) & NoLegend()
print(plot)
# Save the same plot under the tissue-specific figures directory
umap_file <- paste0("./figures/", tissue, "/", tissue, "_UMAP_clusters_noLabels.pdf")
pdf(umap_file)
print(plot)
dev.off()
png
2
UMAP with cluster numbers
# UMAP of the active identities with cluster labels drawn on the plot
plot <- DimPlot(sobj, label = TRUE)
print(plot)
# Save the same plot under the tissue-specific figures directory
umap_file <- paste0("./figures/", tissue, "/", tissue, "_UMAP_clusters_labels.pdf")
pdf(umap_file)
print(plot)
dev.off()
png
2
Map with SCINA-generated cell-type labels
# UMAP grouped by the refined SCINA labels, labelled on the plot, no legend
scina_umap <- DimPlot(sobj, group.by = "scina_labels_refined", label = TRUE) & NoLegend()
print(scina_umap)
Map with general cell-type labels for paper
# UMAP grouped by the general cell-type labels, with repelled text labels
plot <- DimPlot(
  sobj,
  group.by = "general_cell_labels",
  label = TRUE,
  repel = TRUE
) +
  ggtitle(NULL)
print(plot)
# Save as a wide PDF (12 x 8 in)
umap_file <- paste0("./figures/", tissue, "/", tissue, "_UMAP_general_cell_labels.pdf")
pdf(umap_file, height = 8, width = 12)
print(plot)
dev.off()
png
2
Map grouping by general cell type labels but with no labels on plot
# Same grouping as the labelled map, but with the title and legend removed
plot <- (DimPlot(sobj, group.by = "general_cell_labels") + ggtitle(NULL)) &
  NoLegend()
print(plot)
umap_file <- paste0(
  "./figures/", tissue, "/", tissue, "_UMAP_general_cell_labels_noLabels.pdf"
)
pdf(umap_file)
print(plot)
dev.off()
png
2
Map with original identities
# One "Safe" CARTO colour per distinct value of orig.ident
n_idents <- nlevels(as.factor(sobj$orig.ident))
ident_cols <- carto_pal(n_idents, "Safe")
# UMAP coloured by original identity, without a title
plot <- DimPlot(sobj, group.by = "orig.ident", cols = ident_cols) +
  ggtitle(NULL)
print(plot)
umap_file <- paste0("./figures/", tissue, "/", tissue, "_UMAP_orig_idents.pdf")
pdf(umap_file, height = 5, width = 7)
print(plot)
dev.off()
png
2
Barplot with original identities on a cluster-level grouping:
# Per-cell metadata that the bar plots are drawn from
df <- sobj@meta.data
# Position of the column named by `res` (the cluster assignments)
col <- which(colnames(df) == res)
# Replace that column with the active identities, levelled in numeric order
# so that clusters sort as numbers rather than as strings
cluster_levels <- sort(as.numeric(levels(Idents(sobj))))
df[, col] <- factor(Idents(sobj), levels = cluster_levels)
small_axis_text <- theme(axis.text = element_text(size = 7))
# Number of cells per cluster, stacked by original identity
count_bars <- ggplot(df, aes(x = get(res), fill = orig.ident)) +
  geom_bar() +
  small_axis_text
print(count_bars)
# Same bars rescaled so each cluster sums to 1
prop_bars <- ggplot(df, aes(x = get(res), fill = orig.ident)) +
  geom_bar(position = "fill") +
  small_axis_text
print(prop_bars)
Barplot of original identities grouped by general cell labels:
# Per-cell metadata that the bar plots are drawn from
df <- sobj@meta.data
# Shared look: black-and-white theme, vertical x labels, no x-axis title
label_theme <- theme(
  axis.text = element_text(size = 8),
  axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
  axis.title.x = element_blank()
)
# Cell counts per general cell label, stacked by original identity
plot1 <- ggplot(df, aes(x = general_cell_labels, fill = orig.ident)) +
  geom_bar() +
  scale_fill_carto_d(name = NULL, palette = "Safe") +
  theme_bw() +
  label_theme +
  ylab("Number of cells")
print(plot1)
counts_file <- paste0(
  "./figures/", tissue, "/", tissue, "_barplot_orig_ident_counts.pdf"
)
pdf(counts_file)
print(plot1)
dev.off()
png
2
# Same grouping as the count plot, but each bar is rescaled to sum to 1
# (uses the metadata data frame `df` created for the count plot)
label_theme <- theme(
  axis.text = element_text(size = 8),
  axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
  axis.title.x = element_blank()
)
plot2 <- ggplot(df, aes(x = general_cell_labels, fill = orig.ident)) +
  geom_bar(position = "fill") +
  scale_fill_carto_d(name = NULL, palette = "Safe") +
  theme_bw() +
  label_theme +
  ylab("Proportion of cells")
print(plot2)
prop_file <- paste0(
  "./figures/", tissue, "/", tissue, "_barplot_orig_ident_proportions.pdf"
)
pdf(prop_file)
print(plot2)
dev.off()
png
2
Generate dotplot with specific markers
# Dot plot of a fixed marker panel from the SCT assay.
# The title is built from `tissue` so that it is also correct when the PBMC
# object is loaded (it was previously hard-coded to "liver map").
dot_features <- c(
  "PTPRC", "CALCRL", "NKG7", "CD3E", "MARCO",
  "LYZ-1", "CD19", "MS4A1", "STAB2"
)
dot_plot <- DotPlot(sobj, assay = "SCT", features = dot_features) +
  ggtitle(paste("Select features for", tissue, "map"))
print(dot_plot)
Calculate markers for general cell labels then reset resolution
# Switch the active identities to the general cell labels so that markers
# are computed per cell type rather than per cluster
Idents(sobj) <- "general_cell_labels"
# Positive markers only, with the pct / log-fold-change thresholds given below
sobj_markers <- RunPrestoAll(
  object = sobj,
  only.pos = TRUE,
  min.pct = 0.25,
  logfc.threshold = 0.25
)
Calculating cluster B cells
Calculating cluster CD3+/LEF1+/TCF7+ memory T cells
Calculating cluster CD14-/CD16+ Non-classical monocytes
Calculating cluster Erythrocytes
Calculating cluster CD3+/GATA3+/MAF+ TH2-like T cells 1
Calculating cluster CD14-/CD16- Monocytes
Calculating cluster CD3+/CD8A+/NKG7+ NK-like cells
Calculating cluster CD3+/GATA3+/MAF+ TH2-like T cells 2
Calculating cluster CD14+/CD16+ Monocytes
Calculating cluster Antibody-secreting B cells
Calculating cluster Unknown
Calculating cluster Mast cells
Calculating cluster CD3+/MKI67+/TOP2A+ Proliferating T cells
Calculating cluster Hematopoietic stem cells
Calculating cluster Megakaryocytes
# Show the two markers with the largest avg_log2FC for every group
top_two <- slice_max(
  group_by(sobj_markers, cluster),
  order_by = avg_log2FC,
  n = 2
)
print(top_two)
# Restore the cluster-resolution identities
Idents(sobj) <- res
Save markers
# Name of the grouping the markers were computed for (used in the file name)
groups <- "general_cell_labels"
markers_file <- paste0(
  "./figures/", tissue, "/", tissue, "_markers_", groups, ".tsv"
)
# Tab-separated, unquoted, with a header row and no row names
write.table(
  sobj_markers,
  file = markers_file,
  sep = "\t",
  quote = FALSE,
  col.names = TRUE,
  row.names = FALSE
)
Generate heatmap with top 5 markers grouping by general cell types
# Remove mikado genes from marker list.
# Filter on the `gene` column rather than on row names: the row names are
# lost if the table was reloaded from the TSV (written without row names),
# in which case a row-name filter would silently keep every mikado gene.
sobj_markers <- sobj_markers[!grepl("mikado", sobj_markers$gene), ]
# Top 5 markers per group by average log2 fold change (ties are kept)
sobj_markers %>%
  group_by(cluster) %>%
  top_n(n = 5, wt = avg_log2FC) -> top
# For the liver map, draw a random subset of at most 30000 cells to keep the
# heatmap tractable; every other tissue uses all cells. The size is capped at
# the number of cells so that sample() cannot fail on a smaller object.
# The draw is random: call set.seed() beforehand for a reproducible figure.
cells <- colnames(sobj)
if (tissue == "liver") {
  cells <- sample(cells, size = min(30000, length(cells)))
}
heatmap_plot <- DoHeatmap(sobj, features = top$gene,
                          group.by = "general_cell_labels", size = 3,
                          angle = 90, cells = cells) +
  NoLegend() +
  theme(text = element_text(size = 7))
print(heatmap_plot)
Make PDF of heatmap
# Write the marker heatmap to a tall PDF (7 x 13 in); relies on `top` and
# `cells` already existing in the session
groups <- "general_cell_labels"
heatmap_file <- paste0("./figures/", tissue, "/", tissue, "_heatmap_", groups, ".pdf")
pdf(heatmap_file, height = 13, width = 7)
heatmap_for_pdf <- DoHeatmap(
  sobj,
  features = top$gene,
  group.by = "general_cell_labels",
  size = 2,
  angle = 90,
  cells = cells
) +
  NoLegend() +
  theme(text = element_text(size = 7))
print(heatmap_for_pdf)
dev.off()
null device
1
Make specific plots with specific genes. The genes we are interested in include: PTPRC, CALCRL, NKG7, CD3E, MARCO, LYZ, CD19, MS4A1, STAB2, ALB, CD4, CD8A, CLEC4G, CD5L, C1QB, ACTA2, VWF, IGLL5, CD68. Can also plot in italics.
# Feature name as stored in the object (woodchuck-specific nomenclature for
# this genome) and the display name used in plot titles
geneCode <- "sct_LYZ-1"
gene <- "LYZ"
mapType <- "Liver"
# Plain title: "<gene> - <mapType> map"
print(
  FeaturePlot(sobj, features = geneCode) +
    ggtitle(paste(gene, "-", mapType, "map"))
)
# Title showing only the gene symbol in italics
print(
  FeaturePlot(sobj, features = geneCode) +
    ggtitle(bquote(~italic(.(gene))))
)
Another version of the feature plot that outputs genes in italics
# Per-tissue marker panels. `geneCodes` are the feature names as stored in the
# object; `genes` are the matching display symbols (same order, same length)
# used for the italic title and the output file name.
if (tissue == "PBMC") {
  geneCodes <- c("sct_PTPRC","sct_NKG7","sct_CD14",
                 "sct_CD3E","sct_MARCO","sct_LYZ-1",
                 "sct_CD19","sct_MS4A1","sct_STAB2",
                 "sct_CD4","sct_CD8A", "sct_XCL1;XCL2",
                 "sct_CD5L","sct_C1QB", "sct_LEF1",
                 "sct_ACTA2","sct_VWF","sct_IGLL5-1",
                 "sct_CD68","sct_FCGR3A;FCGR3B","sct_TOP2A")
  genes <- c("PTPRC","NKG7","CD14","CD3E","MARCO","LYZ",
             "CD19","MS4A1","STAB2","CD4","CD8A", "XCL1;XCL2",
             "CD5L","C1QB","LEF1","ACTA2","VWF","IGLL5","CD68",
             "FCGR3A;FCGR3B","TOP2A")
} else if (tissue == "liver") {
  geneCodes <- c("sct_Ptprc","sct_CALCRL","sct_NKG7",
                 "sct_CD3E","sct_MARCO","sct_LYZ-1",
                 "sct_CD19","sct_MS4A1","sct_STAB2",
                 "sct_ALB-1","sct_CD4","sct_CD8A",
                 "sct_CLEC4G","sct_CD5L","sct_C1QB",
                 "sct_ACTA2","sct_VWF","sct_IGLL5-1",
                 "sct_CD68","sct_XCL1;XCL2","sct_LEF1",
                 "sct_RSPO3","sct_MECOM")
  genes <- c("PTPRC","CALCRL","NKG7","CD3E","MARCO","LYZ",
             "CD19","MS4A1","STAB2","ALB","CD4","CD8A",
             "CLEC4G","CD5L","C1QB","ACTA2","VWF","IGLL5","CD68",
             "XCL1;XCL2","LEF1","RSPO3","MECOM")
} else {
  # Fail loudly instead of silently reusing vectors from an earlier chunk
  stop("Unsupported tissue: ", tissue, call. = FALSE)
}
# The two vectors are indexed in parallel, so they must line up
stopifnot(length(geneCodes) == length(genes))
# Iterate over every gene; the loop previously started at index 2, which
# skipped the first entry (PTPRC) of both panels
for (num in seq_along(geneCodes)) {
  plot <- FeaturePlot(sobj,
                      features = geneCodes[num]) +
    ggtitle(bquote(~italic(.(genes[num]))))
  print(plot)
  pdf(paste0("./figures/", tissue, "/", tissue, "_", genes[num], "_UMAP.pdf"))
  print(plot)
  dev.off()
}
Plot marker genes for specific cell populations
Endothelial cells:
# Endothelial markers: feature names as stored in the object ...
geneCodes <- c(
  "STAB2", "ITGA1", "CD55", "LYVE1",
  "CD34", "VWF", "IFITM3;IFITM2;IFITM1", "RSPO3",
  "MECOM", "Mecom", "CALCRL", "LOC114089654",
  "RAMP2", "BST2", "CLEC4G", "CTSV;CTSL",
  "STAB1", "PLAC9", "PECAM1", "WNT2",
  "MYL6", "NR2F2"
)
# ... and the display symbols used for titles and file names (same order)
genes <- c(
  "STAB2", "ITGA1", "CD55", "LYVE1",
  "CD34", "VWF", "IFITM3", "RSPO3",
  "MECOM", "Mecom", "CALCRL", "LOC114089654",
  "RAMP2", "BST2", "CLEC4G", "CTSV;CTSL",
  "STAB1", "PLAC9", "PECAM1", "WNT2",
  "MYL6", "NR2F2"
)
# One UMAP per gene: shown on screen, then written to its own PDF
for (num in seq_along(geneCodes)) {
  plot <- FeaturePlot_scCustom(
    sobj,
    features = geneCodes[num],
    colors_use = viridis_light_high
  ) +
    ggtitle(bquote(~italic(.(genes[num]))))
  print(plot)
  gene_file <- paste0("./figures/", tissue, "/", tissue, "_", genes[num], "_UMAP.pdf")
  pdf(gene_file)
  print(plot)
  dev.off()
}
Lymphocytes:
Lymphocytes weird genes:
Mesenchyme:
# Mesenchymal markers: feature names and display symbols are identical here
geneCodes <- c(
  "HHIP", "COL1A2", "COL3A1", "IGFBP7",
  "IGFBP3", "DCN", "COL1A1", "SPARC",
  "RBP1", "CALCRL"
)
genes <- c(
  "HHIP", "COL1A2", "COL3A1", "IGFBP7",
  "IGFBP3", "DCN", "COL1A1", "SPARC",
  "RBP1", "CALCRL"
)
# One UMAP per gene: shown on screen, then written to its own PDF
for (num in seq_along(geneCodes)) {
  plot <- FeaturePlot_scCustom(
    sobj,
    features = geneCodes[num],
    colors_use = viridis_light_high
  ) +
    ggtitle(bquote(~italic(.(genes[num]))))
  print(plot)
  gene_file <- paste0("./figures/", tissue, "/", tissue, "_", genes[num], "_UMAP.pdf")
  pdf(gene_file)
  print(plot)
  dev.off()
}
Cholangiocytes:
# Cholangiocyte markers: feature names and display symbols are identical here
geneCodes <- c("KRT19", "CFTR", "EPCAM", "SLC4A4")
genes <- c("KRT19", "CFTR", "EPCAM", "SLC4A4")
# One UMAP per gene: shown on screen, then written to its own PDF
for (num in seq_along(geneCodes)) {
  plot <- FeaturePlot_scCustom(
    sobj,
    features = geneCodes[num],
    colors_use = viridis_light_high
  ) +
    ggtitle(bquote(~italic(.(genes[num]))))
  print(plot)
  gene_file <- paste0("./figures/", tissue, "/", tissue, "_", genes[num], "_UMAP.pdf")
  pdf(gene_file)
  print(plot)
  dev.off()
}
Look at zonation gene signatures:
# Gene signature stored as `cv_genes` for the zonation comparison
cv_genes <- c(
  "FETUB", "HMGCS1", "CYP2E1", "GLUD1;GLUD2", "CYP1A2",
  "RGN", "INMT", "COMT", "FDPS", "SPR-1"
)
print(cv_genes)
[1] "FETUB" "HMGCS1" "CYP2E1" "GLUD1;GLUD2" "CYP1A2" "RGN"
[7] "INMT" "COMT" "FDPS" "SPR-1"
# Gene signature stored as `pp_genes` for the zonation comparison
pp_genes <- c(
  "Saa2;Saa1-1", "HAMP", "APOA1", "APOC2", "CRYL1",
  "AMY1A;AMY1C;AMY1B;AMY2A;AMY2B", "MT-ATP6", "UROC1",
  "APOA2", "MT-CO3"
)
print(pp_genes)
[1] "Saa2;Saa1-1" "HAMP"
[3] "APOA1" "APOC2"
[5] "CRYL1" "AMY1A;AMY1C;AMY1B;AMY2A;AMY2B"
[7] "MT-ATP6" "UROC1"
[9] "APOA2" "MT-CO3"
Endothelial cells:
# One set of features (kept for reference; overwritten by the second set)
geneCodes <- c("STAB2","ITGA1","LYVE1","IFITM3;IFITM2;IFITM1",
               "CALCRL","RAMP2","BST2", "CLEC4G",
               "CTSV;CTSL", "STAB1", "VWF", "CD34",
               "MECOM","PLAC9","RSPO3","WNT2")
# Second set of features (the one that is actually plotted)
geneCodes <- c("CLEC4G","STAB1","STAB2","LYVE1",
               "CD34","MECOM","VWF","RSPO3",
               "WNT2","ACKR1")
# Multi-panel UMAP, four panels per row
plot <- FeaturePlot_scCustom(sobj,
                             features = geneCodes,
                             colors_use = viridis_light_high,
                             num_columns = 4)
print(plot)
# Allow 6 in of height per row of panels, as the other marker panels do.
# This gives 18 in for the second set and also fits the first set if it is
# selected instead (the height was previously fixed at 18).
pdf(paste0("./figures/", tissue, "/", tissue, "_endothelialMarkers_UMAP.pdf"),
    height = ceiling(length(geneCodes) / 4) * 6, width = 24)
print(plot)
dev.off()
png
2
T lymphocytes
# First candidate panel (kept for reference; overwritten immediately below)
geneCodes <- c(
  "CD3D", "CD3E", "CD4", "CD8A",
  "CTLA4", "IL7R-1", "LEF1", "EOMES",
  "TIGIT", "KLRB1", "KLRD1", "NKG7",
  "XCL1;XCL2", "TOX", "GZMA", "GZMK"
)
# Panel that is actually plotted
geneCodes <- c(
  "CD3D", "CD3E", "GIMAP1", "GIMAP4",
  "GIMAP6", "GIMAP7", "CD4", "CD28",
  "FOXP1", "CTLA4", "CD38", "XCL1;XCL2",
  "IL2RA", "NKG7", "CD8A", "CCL5",
  "CD69", "CD2", "GZMK", "BRCA2",
  "TOP2A", "STMN1"
)
# Multi-panel UMAP, four panels per row
plot <- FeaturePlot_scCustom(
  sobj,
  features = geneCodes,
  colors_use = viridis_light_high,
  num_columns = 4
)
print(plot)
# 6 in of height per row of four panels
panel_rows <- ceiling(length(geneCodes) / 4)
panel_file <- paste0("./figures/", tissue, "/", tissue, "_lymphocyteMarkers_UMAP.pdf")
pdf(panel_file, height = panel_rows * 6, width = 24)
print(plot)
dev.off()
png
2
T lymphocytes for PBMCs
# T-lymphocyte panel used for the PBMC map
geneCodes <- c(
  "CD3D", "CD3E", "XCL1;XCL2", "NKG7",
  "LEF1", "IL7R-1", "GATA3", "MAF",
  "CCR4", "S100A4", "CXCR3", "IL2RA",
  "CD160", "CD247", "BRCA2", "STMN1"
)
# CCR7 and TCF7 aren't showing up???
# Multi-panel UMAP from the "data" slot, four panels per row
plot <- FeaturePlot_scCustom(
  sobj,
  features = geneCodes,
  colors_use = viridis_light_high,
  num_columns = 4,
  slot = "data"
)
print(plot)
# 6 in of height per row of four panels; note the file name is the same as
# the one used by the general T-lymphocyte panel
panel_rows <- ceiling(length(geneCodes) / 4)
panel_file <- paste0("./figures/", tissue, "/", tissue, "_lymphocyteMarkers_UMAP.pdf")
pdf(panel_file, height = panel_rows * 6, width = 24)
print(plot)
dev.off()
png
2
Mesenchyme:
# First candidate panel (kept for reference; overwritten immediately below)
geneCodes <- c("DCN","COL1A2","COL3A1","HHIP",
               "CALCRL","SPARC","RBP1","COL1A1",
               "IGFBP3", "IGFBP7")
# Panel that is actually plotted. "RBP1" used to be listed twice; unique()
# guards against requesting the same feature more than once.
geneCodes <- unique(c("COL1A2", "COL3A1", "IGFBP7", "IGFBP3",
                      "DCN", "CALCRL", "COL1A1", "SPARC",
                      "RBP1", "HHIP","LRAT",
                      "PDE3B","HGF","CNN2","ACTA2"))
# Multi-panel UMAP, four panels per row
plot <- FeaturePlot_scCustom(sobj,
                             features = geneCodes,
                             colors_use = viridis_light_high,
                             num_columns = 4)
print(plot)
# Allow 6 in of height per row of panels, as the other marker panels do
# (still 24 in for the current 15 features)
pdf(paste0("./figures/", tissue, "/", tissue, "_mesenchymalMarkers_UMAP.pdf"),
    height = ceiling(length(geneCodes) / 4) * 6, width = 24)
print(plot)
dev.off()
png
2
Cholangiocytes:
# First candidate panel (kept for reference; overwritten immediately below)
geneCodes <- c("KRT19", "SLC4A4", "CFTR", "EPCAM")
# Panel that is actually plotted
geneCodes <- c(
  "ANXA2", "CST3", "BIRC3", "TESC",
  "KRT19", "SOX9", "EPCAM", "SLC4A4",
  "CFTR"
)
# Multi-panel UMAP, four panels per row
plot <- FeaturePlot_scCustom(
  sobj,
  features = geneCodes,
  colors_use = viridis_light_high,
  num_columns = 4
)
print(plot)
# 6 in of height per row of four panels
panel_rows <- ceiling(length(geneCodes) / 4)
panel_file <- paste0("./figures/", tissue, "/", tissue, "_cholangiocyteMarkers_UMAP.pdf")
pdf(panel_file, height = panel_rows * 6, width = 24)
print(plot)
dev.off()
png
2
Myeloid cells
# Myeloid marker panel; features missing from the object are dropped by the
# plotting function with a warning
geneCodes <- c(
  "TYROBP", "CD74", "CTSS", "VSIG4",
  "MARCO", "CD5L", "HMOX1", "VCAM1",
  "ITGAM", "FCGR3A;FCGR3B", "Fcgr3;Fcgr2b", "CTSA",
  "IL17RA-1", "SIRPA;SIRPB1;SIRPG", "BTLA", "CADM1",
  "ID2", "LY6E", "FLT3", "CD14",
  "NOTCH2", "LYZ-1", "S100A8"
)
# Multi-panel UMAP, four panels per row
plot <- FeaturePlot_scCustom(
  sobj,
  features = geneCodes,
  colors_use = viridis_light_high,
  num_columns = 4
)
Warning: The following features were omitted as they were not found:
ℹ MARCO, CD5L, HMOX1, VCAM1, CTSA, IL17RA-1, and LYZ-1
# Show the myeloid panel, then write it to PDF with 6 in of height per row of
# four requested features (uses `plot` and `geneCodes` from the chunk above)
print(plot)
panel_rows <- ceiling(length(geneCodes) / 4)
panel_file <- paste0("./figures/", tissue, "/", tissue, "_myeloidMarkers_UMAP.pdf")
pdf(panel_file, height = panel_rows * 6, width = 24)
print(plot)
dev.off()
png
2
Generate heatmaps comparing woodchuck clusters with various datasets
First, set up whatever woodchuck dataset I am working with by reading in the ortholog table, choosing whether the orthologs to use are human, mouse, or woodchuck, and calculating the average expression for each cluster. The output of this section is the scaled cluster gene-expression matrix
# Grouping used for the averages: the general cell labels
# (switch to the commented line to average per cluster instead)
#groups <- res
groups <- "general_cell_labels"
# Ortholog table (tab-separated, with a header row)
ortholog_file <- "~/Dropbox/Zoe/scf_version/make_gtf/orthofinder_sc2/homologene/collectedOrthofinderPairings.tsv"
geneNameTable <- read.table(ortholog_file, header = TRUE, sep = "\t")
# Average the SCT scale.data values within each group
woodchuckClusterAverages <- AverageExpression(
  sobj,
  group.by = groups,
  assays = "SCT",
  slot = "scale.data"
)
Warning: `invoke()` is deprecated as of rlang 0.4.0.
# Scale each gene (row) across groups, then drop rows that contain NA
sct_averages <- as.matrix(woodchuckClusterAverages$SCT)
woodchuckClusterAverages$SCT <- na.omit(t(scale(t(sct_averages))))
# Gene names of the surviving rows, as a one-column data frame so they can be
# joined against the ortholog table
uniqueHier <- row.names(woodchuckClusterAverages$SCT)
uniqueHier <- as.data.frame(uniqueHier)
# Left join keeps the gene order of the expression matrix (uniqueHier is on
# the left)
newNames <- dplyr::left_join(uniqueHier, geneNameTable, by = "uniqueHier")
# Choose which species' one-to-one orthologs supply the row names.
# Where the chosen species has no one-to-one ortholog (NA), fall back to the
# woodchuck name to avoid any potential mistaken matches.
species <- "human"
newNames$speciesOneToOne <- switch(
  species,
  human = ifelse(
    is.na(newNames$humanOneToOne),
    newNames$uniqueHier,
    newNames$humanOneToOne
  ),
  mouse = ifelse(
    is.na(newNames$mouseOneToOne),
    newNames$uniqueHier,
    newNames$mouseOneToOne
  ),
  woodchuck = newNames$uniqueHier
)
# Keep only the SCT matrix and relabel its rows with the chosen names
woodchuckClusterAverages <- woodchuckClusterAverages$SCT
row.names(woodchuckClusterAverages) <- newNames$speciesOneToOne
# Convert to a data frame and sort rows alphabetically by gene name
woodchuckClusterAverages <- as.data.frame(woodchuckClusterAverages)
gene_order <- order(row.names(woodchuckClusterAverages))
woodchuckClusterAverages <- woodchuckClusterAverages[gene_order, ]
# Sanity check
print(head(woodchuckClusterAverages))
Correlation of woodchuck PBMCs with human 68k PBMC dataset from 10X Genomics
# Read the 68K PBMC enriched-gene table without treating any row as a header
pbmc_csv <- "~/Dropbox/Zoe/scf_version/analysis/correlationTests/68K_pbmc_data/68K_enrichedGenes.csv"
humanPBMC <- read.csv(pbmc_csv, header = FALSE)
# Drop the top row
humanPBMC <- humanPBMC[-1, ]
# Columns 1-3 hold the all-cell table, columns 5-7 the myeloid table
allCells <- dplyr::select(humanPBMC, V1, V2, V3)
myeloid <- dplyr::select(humanPBMC, V5, V6, V7)
# Give both tables the same column names
table_cols <- c("Cluster", "Gene", "Enrichment")
colnames(allCells) <- table_cols
colnames(myeloid) <- table_cols
# Drop the (new) top row of each table as well
allCells <- allCells[-1, ]
myeloid <- myeloid[-1, ]
# Build one Gene/Enrichment<j> table per cluster 1..10
cluster_tables <- lapply(1:10, function(j) {
  dat <- dplyr::filter(allCells, Cluster == j)
  # Multiply enrichment values by -1 because the signs are backwards???
  dat$Enrichment <- as.numeric(dat$Enrichment) * -1
  dat <- dplyr::select(dat, Gene, Enrichment)
  colnames(dat) <- c("Gene", paste("Enrichment", j, sep = ""))
  dat
})
# Full-join the per-cluster tables on gene name, in cluster order
allCellsMatrix <- Reduce(
  function(joined, nxt) dplyr::full_join(joined, nxt, by = "Gene"),
  cluster_tables
)
# Move gene names into the row names
rownames(allCellsMatrix) <- allCellsMatrix$Gene
allCellsMatrix <- dplyr::select(allCellsMatrix, -Gene)
# Label the ten cluster columns with cell-type names
colnames(allCellsMatrix) <- c(
  "Activated CD8+", "Naive CD8+", "Memory and Reg T",
  "Naive CD4+", "NK", "CD8+", "B", "Megakaryocytes",
  "Monocytes and Dendritic", "B, Dendritic, T"
)
# Scale each gene (row) across the cell-type columns
allCellsMatrix <- t(scale(t(allCellsMatrix)))
# Sort rows alphabetically by gene name
allCellsMatrix <- allCellsMatrix[order(row.names(allCellsMatrix)), ]
# Sanity check
print(head(allCellsMatrix))
Activated CD8+ Naive CD8+ Memory and Reg T Naive CD4+ NK
42430 -0.7290166 -1.4467216 -0.19073790 -0.4832807 0.02379348
42431 -0.2300094 -0.7300300 -0.26479348 -0.7995981 -0.18652940
42618 -0.1212209 -0.6309294 -0.09745206 -0.4354971 -0.36419075
A4GALT -0.4058024 -1.0280327 -0.68648235 -0.4734361 -0.11159565
AATK -1.0033368 -0.9033733 -0.11847519 -1.0810861 -0.10366579
ABCA1 -0.6939429 -1.0596759 -0.79368826 -0.8411861 -0.88868386
CD8+ B Megakaryocytes Monocytes and Dendritic
42430 -0.8928406 -0.2258430 1.314882 1.50210970
42431 -0.3474056 -0.6952459 2.578802 0.03087083
42618 -0.8527715 -0.3826776 2.691420 -0.14763062
A4GALT -0.5478332 -0.1352675 2.062829 -0.23671805
AATK -0.4257702 -0.2369504 1.958543 1.15513309
ABCA1 -0.1144697 0.6312458 1.186970 1.02547753
B, Dendritic, T
42430 1.1276549
42431 0.6439395
42618 0.3409502
A4GALT 1.5623391
AATK 0.7589817
ABCA1 1.5479533
# Tag identifying which reference dataset `allCellsMatrix` currently holds
speciesData <- "68KPBMC"
Correlation of woodchuck liver with human liver dataset from MacParland et al. (2018)
# Load the human liver workspace image; it is expected to provide
# `HumanLiverSeurat`
human_liver_rdata <- "~/Dropbox/Zoe/scf_version/analysis/correlationTests/HumanLiver.RData"
load(human_liver_rdata)
# Bring the stored object up to the installed Seurat object structure
HumanLiverSeurat <- UpdateSeuratObject(object = HumanLiverSeurat)
Validating object structure
Updating object slots
Ensuring keys are in the proper structure
Warning: Assay RNA changing from Assay to AssayWarning: DimReduc pca changing from DimReduc to DimReducWarning: DimReduc tsne changing from DimReduc to DimReducEnsuring keys are in the proper structure
Ensuring feature names don't have underscores or pipes
Updating slots in RNA
Updating slots in pca
Updating slots in tsne
Setting tsne DimReduc to global
Validating object structure for Assay ‘RNA’
Validating object structure for DimReduc ‘pca’
Validating object structure for DimReduc ‘tsne’
Object representation is consistent with the most current Seurat version
# Run SCTransform on the human liver object with default settings
HumanLiverSeurat <- SCTransform(object = HumanLiverSeurat)
Running SCTransform on assay: RNA
vst.flavor='v2' set, setting model to use fixed slope and exclude poisson genes.
Calculating cell attributes from input UMI matrix: log_umi
Total Step 1 genes: 17501
Total overdispersed genes: 14764
Excluding 2737 genes from Step 1 because they are not overdispersed.
Variance stabilizing transformation of count matrix of size 18715 by 8444
Model formula is y ~ log_umi
Get Negative Binomial regression parameters per gene
Using 2000 genes, 5000 cells
|
| | 0%
Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.
|
|================== | 25%
Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.
|
|=================================== | 50%
Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.
|
|==================================================== | 75%
Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.
|
|======================================================================| 100%
Setting estimate of 117 genes to inf as theta_mm/theta_mle < 1e-3
# of step1 poisson genes (variance < mean): 0
# of low mean genes (mean < 0.001): 1363
Total # of Step1 poisson genes (theta=Inf; variance < mean): 136
Total # of poisson genes (theta=Inf; variance < mean): 3896
Calling offset model for all 3896 poisson genes
Found 156 outliers - those will be ignored in fitting/regularization step
Ignoring theta inf genes
Replacing fit params for 3896 poisson genes by theta=Inf
Setting min_variance based on median UMI: 0.04
Second step: Get residuals using fitted parameters for 18715 genes
|
| | 0%
|
|== | 3%
|
|==== | 5%
|
|====== | 8%
|
|======= | 11%
|
|========= | 13%
|
|=========== | 16%
|
|============= | 18%
|
|=============== | 21%
|
|================= | 24%
|
|================== | 26%
|
|==================== | 29%
|
|====================== | 32%
|
|======================== | 34%
|
|========================== | 37%
|
|============================ | 39%
|
|============================= | 42%
|
|=============================== | 45%
|
|================================= | 47%
|
|=================================== | 50%
|
|===================================== | 53%
|
|======================================= | 55%
|
|========================================= | 58%
|
|========================================== | 61%
|
|============================================ | 63%
|
|============================================== | 66%
|
|================================================ | 68%
|
|================================================== | 71%
|
|==================================================== | 74%
|
|===================================================== | 76%
|
|======================================================= | 79%
|
|========================================================= | 82%
|
|=========================================================== | 84%
|
|============================================================= | 87%
|
|=============================================================== | 89%
|
|================================================================ | 92%
|
|================================================================== | 95%
|
|==================================================================== | 97%
|
|======================================================================| 100%
Computing corrected count matrix for 18715 genes
|
| | 0%
|
|== | 3%
|
|==== | 5%
|
|====== | 8%
|
|======= | 11%
|
|========= | 13%
|
|=========== | 16%
|
|============= | 18%
|
|=============== | 21%
|
|================= | 24%
|
|================== | 26%
|
|==================== | 29%
|
|====================== | 32%
|
|======================== | 34%
|
|========================== | 37%
|
|============================ | 39%
|
|============================= | 42%
|
|=============================== | 45%
|
|================================= | 47%
|
|=================================== | 50%
|
|===================================== | 53%
|
|======================================= | 55%
|
|========================================= | 58%
|
|========================================== | 61%
|
|============================================ | 63%
|
|============================================== | 66%
|
|================================================ | 68%
|
|================================================== | 71%
|
|==================================================== | 74%
|
|===================================================== | 76%
|
|======================================================= | 79%
|
|========================================================= | 82%
|
|=========================================================== | 84%
|
|============================================================= | 87%
|
|=============================================================== | 89%
|
|================================================================ | 92%
|
|================================================================== | 95%
|
|==================================================================== | 97%
|
|======================================================================| 100%
Calculating gene attributes
Wall clock passed: Time difference of 1.941124 mins
Determine variable features
Centering data matrix
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
Place corrected count matrix in counts slot
Set default assay to SCT
# Average the SCT scale.data values of the human liver object per identity
humanClusterAverages <- AverageExpression(
  HumanLiverSeurat,
  assays = "SCT",
  slot = "scale.data"
)
# Replace cluster numbers with names (one name per column, in column order)
human_cluster_names <- c(
  "Hep 1", "Alpha-beta T cells", "Hep 2",
  "Inflammatory macs", "Hep 3", "Hep 4",
  "Plasma cells", "NK-like cells", "Gamma-delta T cells",
  "Non-inflammatory macs", "Periportal LSECs", "Central venous LSECs",
  "Portal endothelial cells", "Hep 5", "Hep 6",
  "Mature B cells", "Cholangiocytes", "Gamma-delta T cells 2",
  "Erythroid cells", "Hepatic stellate cells"
)
colnames(humanClusterAverages$SCT) <- human_cluster_names
# If only looking at specific clusters
#humanClusterAverages$SCT <- humanClusterAverages$SCT[,c("3","1","15","6","14","5")]
# Otherwise go straight to here:
# scale each gene (row) across clusters, then drop rows that contain NA
human_averages <- as.matrix(humanClusterAverages$SCT)
humanClusterAverages$SCT <- na.omit(t(scale(t(human_averages))))
# Gene names of the surviving rows
humanGenes <- row.names(humanClusterAverages$SCT)
# Convert to a data frame and sort rows alphabetically by gene name
allCellsMatrix <- as.data.frame(humanClusterAverages$SCT)
human_gene_order <- order(row.names(allCellsMatrix))
allCellsMatrix <- allCellsMatrix[human_gene_order, ]
# Sanity check
print(head(allCellsMatrix))
# Tag identifying which reference dataset `allCellsMatrix` currently holds
speciesData <- "macparland"
Correlation of woodchuck liver with human liver dataset from Aizarani et al.
# Load the Aizarani et al. expression matrix
aizarani <- readRDS(
  file = "~/Dropbox/Zoe/scf_version/analysis/correlationTests/GSE124395_Normalhumanliverdata.RData"
)
# Load the cluster partition table (its row names are compared with the
# matrix column names below)
aizaraniClusters <- read.table(
  file = "~/Dropbox/Zoe/scf_version/analysis/correlationTests/GSE124395_clusterpartition.txt"
)
# Keep only the cells that also appear in the cluster table
aizarani <- aizarani[, intersect(colnames(aizarani), rownames(aizaraniClusters))]
# Wrap the filtered matrix in a Seurat object
aizarani <- CreateSeuratObject(counts = aizarani)
Counts matrix provided is not sparse. Creating V5 assay in Seurat Object.
Warning: Feature names cannot have underscores ('_'), replacing with dashes ('-')
# Run SCTransform on the Aizarani object (the log below reports that the
# default assay is set to SCT afterwards)
aizarani <- SCTransform(object = aizarani)
Running SCTransform on assay: RNA
Running SCTransform on layer: counts
Using block 2 from counts to learn model.
Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. 
Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Getting residuals for block 1(of 3) for counts dataset
Getting residuals for block 2(of 3) for counts dataset
Getting residuals for block 3(of 3) for counts dataset
Centering data matrix
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
Finished calculating residuals for counts
Set default assay to SCT
# Add cluster IDs.
# The object was subset to the cells shared with the cluster table, so its
# cell order (and count) need not match the table's rows. Look every cell up
# by name instead of assigning the column positionally.
aizaraniClusterRows <- match(colnames(aizarani), row.names(aizaraniClusters))
# Every cell must be present in the cluster table
stopifnot(!anyNA(aizaraniClusterRows))
Idents(aizarani) <- aizaraniClusters$sct.cpart[aizaraniClusterRows]
# Get cluster averages of the SCT scale.data values
aizaraniAverages <- AverageExpression(aizarani,
                                      assays = "SCT",
                                      slot = "scale.data")
# Z-score every gene across clusters (scale() works on columns, hence the
# double transpose) and drop genes containing NA
aizaraniAverages$SCT <- na.omit(t(scale(t(as.matrix(aizaraniAverages$SCT)))))
# Grab gene names
aizaraniGenes <- row.names(aizaraniAverages$SCT)
# Now turn into large dataframe
allCellsMatrix <- as.data.frame(aizaraniAverages$SCT)
# Order columns by cluster number.
# NOTE(review): some Seurat releases may prefix numeric identities in the
# AverageExpression output (e.g. "g1") - confirm for the installed version.
# Stripping non-digits handles both the plain and the prefixed form.
aizaraniClusterNumbers <- as.numeric(gsub("[^0-9]", "", colnames(allCellsMatrix)))
stopifnot(!anyNA(aizaraniClusterNumbers))
allCellsMatrix <- allCellsMatrix[, order(aizaraniClusterNumbers), drop = FALSE]
# Rename columns to be more meaningful (not totally confident I got all correct)
aizaraniLabels <- c("NK, NKT and T cells (1)",
                    "Kupffer cells (2)",
                    "NK, NKT and T cells (3)",
                    "EPCAM+ cells and cholangiocytes (4)",
                    "NK, NKT and T cells (5)",
                    "Kupffer cells (6)",
                    "EPCAM+ cells and cholangiocytes (7)",
                    "B cells (8)",
                    "Liver sinusoidal endothelial cells (9)",
                    "Macrovascular endothelial cells (10)",
                    "Hepatocytes (11)",
                    "NK, NKT and T cells (12)",
                    "Liver sinusoidal endothelial cells (13)",
                    "Hepatocytes (14)",
                    "Other endothelial cells (15)",
                    "Other (16)",
                    "Hepatocytes (17)",
                    "NK, NKT and T cells (18)",
                    "NK, NKT and T cells (19)",
                    "Liver sinusoidal endothelial cells (20)",
                    "Macrovascular endothelial cells (21)",
                    "B cells (22)",
                    "Kupffer cells (23)",
                    "EPCAM+ cells and cholangiocytes (24)",
                    "Kupffer cells (25)",
                    "Other endothelial cells (26)",
                    "Other (27)",
                    "NK, NKT and T cells (28)",
                    "Macrovascular endothelial cells (29)",
                    "Hepatocytes (30)",
                    "Kupffer cells (31)",
                    "Macrovascular endothelial cells (32)",
                    "Stellate cells and myofibroblasts (33)",
                    "B cells (34)",
                    "Other endothelial cells (35)",
                    "Other (36)",
                    "Other (37)",
                    "B cells (38)",
                    "EPCAM+ cells and cholangiocytes (39)")
# The labels are applied by position; fail loudly if the number of clusters
# differs from the number of labels rather than silently creating NA names
stopifnot(ncol(allCellsMatrix) == length(aizaraniLabels))
colnames(allCellsMatrix) <- aizaraniLabels
# Order by row name
allCellsMatrix <- allCellsMatrix[order(row.names(allCellsMatrix)),]
# Sanity check
head(allCellsMatrix)
# Tag used in the output file names of the correlation plots
speciesData <- "aizarani"
Correlation of woodchuck liver with human liver dataset from Andrews et al. (2022)
# Load the Andrews et al. human liver map
humanNuc <- readRDS(
  file = "~/Dropbox/Zoe/scf_version/analysis/correlationTests/single_nuc_20_human_map.rds"
)
# Bring the saved object up to date with the installed Seurat object structure
# (the single-nucleus subset is taken after this step)
humanNuc <- UpdateSeuratObject(object = humanNuc)
Validating object structure
Updating object slots
Ensuring keys are in the proper structure
Warning: Assay RNA changing from Assay to AssayWarning: Graph RNA_nn changing from Graph to GraphWarning: Graph RNA_snn changing from Graph to GraphWarning: DimReduc pca changing from DimReduc to DimReducWarning: DimReduc tsne changing from DimReduc to DimReducWarning: DimReduc umap changing from DimReduc to DimReducWarning: DimReduc harmony changing from DimReduc to DimReducEnsuring keys are in the proper structure
Ensuring feature names don't have underscores or pipes
Updating slots in RNA
Updating slots in RNA_nn
Setting default assay of RNA_nn to RNA
Updating slots in RNA_snn
Setting default assay of RNA_snn to RNA
Updating slots in pca
Updating slots in tsne
Setting tsne DimReduc to global
Updating slots in umap
Setting umap DimReduc to global
Updating slots in harmony
Setting assay used for RunPCA.RNA to RNA
Setting assay used for RunUMAP.RNA.pca to RNA
Setting assay used for Seurat..ProjectDim.RNA.harmony to RNA
Setting assay used for RunUMAP.RNA.harmony to RNA
No assay information could be found for RunTSNE
Warning: Adding a command log without an assay associated with itSetting assay used for FindNeighbors.RNA.harmony to RNA
No assay information could be found for FindClusters
Warning: Adding a command log without an assay associated with itValidating object structure for Assay ‘RNA’
Validating object structure for Graph ‘RNA_nn’
Validating object structure for Graph ‘RNA_snn’
Validating object structure for DimReduc ‘pca’
Validating object structure for DimReduc ‘tsne’
Validating object structure for DimReduc ‘umap’
Validating object structure for DimReduc ‘harmony’
Object representation is consistent with the most current Seurat version
# Isolate single-nuc only: keep cells whose assay_type is "single_nuc"
humanNuc <- subset(x = humanNuc, subset = assay_type == "single_nuc")
# Run SCTransform on the single-nucleus subset
humanNuc <- SCTransform(object = humanNuc)
Running SCTransform on assay: RNA
vst.flavor='v2' set, setting model to use fixed slope and exclude poisson genes.
Calculating cell attributes from input UMI matrix: log_umi
Total Step 1 genes: 10432
Total overdispersed genes: 10429
Excluding 3 genes from Step 1 because they are not overdispersed.
Variance stabilizing transformation of count matrix of size 10432 by 43863
Model formula is y ~ log_umi
Get Negative Binomial regression parameters per gene
Using 2000 genes, 5000 cells
|
| | 0%
Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.
|
|================== | 25%
Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.
|
|=================================== | 50%
Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.
|
|==================================================== | 75%
Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.
|
|======================================================================| 100%
Setting estimate of 97 genes to inf as theta_mm/theta_mle < 1e-3
# of step1 poisson genes (variance < mean): 0
# of low mean genes (mean < 0.001): 0
Total # of Step1 poisson genes (theta=Inf; variance < mean): 97
Total # of poisson genes (theta=Inf; variance < mean): 3
Calling offset model for all 3 poisson genes
Found 179 outliers - those will be ignored in fitting/regularization step
Ignoring theta inf genes
Replacing fit params for 3 poisson genes by theta=Inf
Setting min_variance based on median UMI: 0.04
Second step: Get residuals using fitted parameters for 10432 genes
|
| | 0%
|
|=== | 5%
|
|======= | 10%
|
|========== | 14%
|
|============= | 19%
|
|================= | 24%
|
|==================== | 29%
|
|======================= | 33%
|
|=========================== | 38%
|
|============================== | 43%
|
|================================= | 48%
|
|===================================== | 52%
|
|======================================== | 57%
|
|=========================================== | 62%
|
|=============================================== | 67%
|
|================================================== | 71%
|
|===================================================== | 76%
|
|========================================================= | 81%
|
|============================================================ | 86%
|
|=============================================================== | 90%
|
|=================================================================== | 95%
|
|======================================================================| 100%
Computing corrected count matrix for 10432 genes
|
| | 0%
|
|=== | 5%
|
|======= | 10%
|
|========== | 14%
|
|============= | 19%
|
|================= | 24%
|
|==================== | 29%
|
|======================= | 33%
|
|=========================== | 38%
|
|============================== | 43%
|
|================================= | 48%
|
|===================================== | 52%
|
|======================================== | 57%
|
|=========================================== | 62%
|
|=============================================== | 67%
|
|================================================== | 71%
|
|===================================================== | 76%
|
|========================================================= | 81%
|
|============================================================ | 86%
|
|=============================================================== | 90%
|
|=================================================================== | 95%
|
|======================================================================| 100%
Calculating gene attributes
Wall clock passed: Time difference of 1.578703 mins
Determine variable features
Centering data matrix
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
Place corrected count matrix in counts slot
Set default assay to SCT
# Use the sub_annotation metadata column as the active identities
Idents(humanNuc) <- humanNuc@meta.data$sub_annotation
# Average the SCT scale.data values within each identity
humanNucClustAverages <- AverageExpression(
  object = humanNuc,
  assays = "SCT",
  slot = "scale.data"
)
Warning: `invoke()` is deprecated as of rlang 0.4.0.
# Z-score every gene across identities (scale() works on columns, hence the
# double transpose) and drop genes containing NA
humanNucClustAverages$SCT <- humanNucClustAverages$SCT %>%
  as.matrix() %>%
  t() %>%
  scale() %>%
  t() %>%
  na.omit()
# Keep the gene names
nucGenes <- rownames(humanNucClustAverages$SCT)
# Convert to a data frame with rows sorted by gene name
allCellsMatrix <- as.data.frame(humanNucClustAverages$SCT)
allCellsMatrix <- allCellsMatrix[order(rownames(allCellsMatrix)), ]
# Sanity check
head(allCellsMatrix)
# Tag used in the output file names of the correlation plots
speciesData <- "andrews"
Correlation of woodchuck liver with woodchuck PBMCs. For this correlation, first read in the woodchuck liver dataset at the beginning of this script, then read in the woodchuck PBMCs below.
# Metadata column used to group cells when averaging
groups <- "general_cell_labels"
# Start with liver and read in woodchuck PBMCs again (the RData file provides
# the scSeurat object)
load(file = "~/Dropbox/Zoe/scf_version/analysis/healthy_sc/seurat_objects/no_dropletQC/integrated_PBMC_cca_kanchor5_scClustViz.RData")
Idents(scSeurat) <- "integrated_snn_res.0.6"
# Average the SCT scale.data values within each group
pbmcClusterAverages <- AverageExpression(
  object = scSeurat,
  assays = "SCT",
  slot = "scale.data",
  group.by = groups
)
# Z-score every gene across groups (scale() works on columns, hence the double
# transpose), drop genes containing NA, and convert to a data frame
pbmcClusterAverages <- pbmcClusterAverages$SCT %>%
  as.matrix() %>%
  t() %>%
  scale() %>%
  t() %>%
  na.omit() %>%
  as.data.frame()
# Sort rows by gene name
allCellsMatrix <- pbmcClusterAverages[order(rownames(pbmcClusterAverages)), ]
# Tag used in the output file names of the correlation plots
speciesData <- "PBMC"
# Genes present in both the comparison matrix and the woodchuck averages
matches <- intersect(
  rownames(allCellsMatrix),
  rownames(woodchuckClusterAverages)
)
# Look at how many genes matched
length(matches)
[1] 630
# Restrict both matrices to the shared genes, in the same row order
toCor <- allCellsMatrix[matches, ]
woodchuckAveragesCor <- woodchuckClusterAverages[matches, ]
# Pearson correlation between the columns of the two matrices
pearVal <- cor(x = toCor, y = woodchuckAveragesCor, method = "pearson")
# Quick on-screen look
heatmap(pearVal)
#main = paste("Pearson correlation of", speciesData, "vs", woodchuckData),
#xlab = woodchuckData,
#ylab = speciesData)
#margins = c(6,11))
#Rowv = NA,
#Colv = NA)
# Write the heatmap to ./figures/<tissue>/, with wider margins for the labels
pdf(
  file = paste0("./figures/", tissue, "/", tissue, "_", speciesData, "_PearsonCor.pdf"),
  height = 15,
  width = 13
)
heatmap(pearVal, margins = c(13, 13))
dev.off()
png
2
# Spearman (rank) correlation between the columns of the two matrices
spearVal <- cor(x = toCor, y = woodchuckAveragesCor, method = "spearman")
# Quick on-screen look
heatmap(spearVal)
#main = paste("Spearman correlation of", speciesData, "vs", woodchuckData),
#xlab = woodchuckData,
#ylab = speciesData)
#margins = c(6,11))
#Rowv = NA,
#Colv = NA)
# Write the heatmap to ./figures/<tissue>/, with wider margins for the labels
pdf(
  file = paste0("./figures/", tissue, "/", tissue, "_", speciesData, "_SpearmanCor.pdf"),
  height = 15,
  width = 13
)
heatmap(spearVal, margins = c(13, 13))
dev.off()
png
2